#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(knitr)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)

Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]

# Correct SFARI Scores
dataset$gene.score = genes_info$gene.score


SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds, GO_neuronal)


Gene Set Enrichment Analysis


Using the package clusterProfiler. Performing Gene Set Enrichment Analysis using the following datasets:

file_name = './../Data/GSEA_bonferroni.RData'

if(file.exists(file_name)){
  load(file_name)
} else {
  ##############################################################################
  # PREPARE DATASET
  
  # Create dataset with top modules membership and removing the genes without an assigned module
  EA_dataset = data.frame('ensembl_gene_id' = genes_info$ID, module = genes_info$Module)  %>% 
               filter(genes_info$Module!='gray')
  
  # Assign Entrez Gene Id to each gene
  getinfo = c('ensembl_gene_id','entrezgene')
  mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', 
                 host='feb2014.archive.ensembl.org')
  biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                         values=genes_info$ID[genes_info$Module!='gray'], mart=mart)
  
  EA_dataset = biomart_output %>% dplyr::rename('ID' = ensembl_gene_id) %>%
               left_join(dataset %>% dplyr::select(ID, contains('MM.')), by='ID')

  
  ##############################################################################
  # PERFORM ENRICHMENT
  
  # Following https://yulab-smu.github.io/clusterProfiler-book/chapter8.html
  
  modules = dataset$Module[dataset$Module!='gray'] %>% as.character %>% table %>% names
  nPerm = 1e5 # 100 times more than the default
  
  enrichment_GO = list()         # Gene Ontology
  enrichment_DO = list()         # Disease Ontology
  enrichment_DGN = list()        # Disease Gene Networks
  enrichment_KEGG = list()       # Kyoto Encyclopedia of Genes and Genomes
  enrichment_Reactome = list()   # Reactome: Pathway db
  
  
  for(module in modules){
    cat('\n')
    cat(paste0('Module: ', which(modules == module), '/', length(modules)))
    geneList = EA_dataset[,paste0('MM.',substring(module,2))]
    names(geneList) = EA_dataset[,'entrezgene'] %>% as.character
    geneList = sort(geneList, decreasing = TRUE)
    
    enrichment_GO[[module]] = gseGO(geneList, OrgDb = org.Hs.eg.db, pAdjustMethod = 'bonferroni', 
                                    pvalueCutoff = 0.1, nPerm = nPerm, verbose = FALSE, seed = TRUE) %>% 
                              data.frame
    enrichment_DO[[module]] = gseDO(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
                                    nPerm = nPerm, verbose = FALSE, seed = TRUE) %>% data.frame
    enrichment_DGN[[module]] = gseDGN(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
                                      nPerm = nPerm, verbose = FALSE, seed = TRUE) %>% data.frame
    enrichment_KEGG[[module]] = gseKEGG(geneList, organism = 'human', pAdjustMethod = 'bonferroni', 
                                        pvalueCutoff = 0.1, nPerm = nPerm, verbose = FALSE, seed = TRUE) %>% 
                                data.frame
    enrichment_Reactome[[module]] = gsePathway(geneList, organism = 'human', pAdjustMethod = 'bonferroni', 
                                               pvalueCutoff = 0.1, nPerm = nPerm, verbose = F, seed = T) %>% 
                                    data.frame
    
    # Temporal save, just in case SFARI Genes enrichment fails
    save(enrichment_GO, enrichment_DO, enrichment_DGN, enrichment_KEGG, enrichment_Reactome, file=file_name)
  }
  
  
  ##############################################################################
  # PERFROM ENRICHMENT FOR SFARI GENES
  
  # BUILD MAPPING BETWEEN GENES AND SFARI

  # Build TERM2GENE: dataframe of 2 columns with term and gene
  term2gene = biomart_output %>% 
              left_join(genes_info %>% dplyr::select(ID, `gene-score`), 
                         by = c('ensembl_gene_id'='ID')) %>% filter(`gene-score`!='Others') %>%
              dplyr::select(-ensembl_gene_id) %>% 
              mutate('SFARI' = 'SFARI', `gene-score` = paste0('SFARI Score ',`gene-score`)) %>%
              melt(id.vars = 'entrezgene') %>% dplyr::select(value, entrezgene) %>% 
              dplyr::rename('term' = value, 'gene' = entrezgene) %>% distinct
  
  
  # PERFORM GSEA
  enrichment_SFARI = list()
  cat('\n\nORA OF SFARI GENES\n')
  
  for(module in modules){
    cat('\n')
    cat(paste0('Module: ', which(modules == module), '/', length(modules)))
    geneList = EA_dataset[,paste0('MM.',substring(module,2))]
    names(geneList) = EA_dataset[,'entrezgene'] %>% as.character
    geneList = sort(geneList, decreasing = TRUE)
      
    enrichment_SFARI[[module]] = clusterProfiler::GSEA(geneList, pAdjustMethod = 'bonferroni',  nPerm = nPerm,
                                                       TERM2GENE = term2gene, pvalueCutoff = 1, maxGSSize=1000,
                                                        verbose = FALSE, seed = TRUE) %>% data.frame
    
    # Temporal save
    save(enrichment_GO, enrichment_DO, enrichment_DGN, enrichment_KEGG, enrichment_Reactome, 
         enrichment_SFARI, file=file_name)
  }

  ##############################################################################
  # Save enrichment results
  save(enrichment_GO, enrichment_DO, enrichment_DGN, enrichment_KEGG, enrichment_Reactome, 
       enrichment_SFARI, file=file_name)
  
  rm(getinfo, mart, biomart_output, gene, module, term2gene, geneList, EA_dataset)
}



Enrichment in SFARI Genes by Module

SFARI_genes_by_module = c()
for(module in names(enrichment_SFARI)){
  module_info = enrichment_SFARI[[module]] %>% mutate(Module = module) %>% dplyr::select(Module, ID, pvalue, p.adjust, NES) %>%
                mutate(pvalue = ifelse(NES>0, pvalue, 1-pvalue), p.adjust = ifelse(NES>0, p.adjust, 1-p.adjust))
  SFARI_genes_by_module = rbind(SFARI_genes_by_module, module_info)
}

SFARI_genes_by_module = SFARI_genes_by_module %>% left_join(dataset %>% dplyr::select(Module, MTcor) %>% 
                                                            group_by(Module,MTcor) %>% tally %>% ungroup, by = 'Module')

plot_data = SFARI_genes_by_module %>% filter(ID == 'SFARI')

ggplotly(plot_data %>% ggplot(aes(MTcor, pvalue, size=n)) + geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) + 
         geom_smooth(color='#cccccc', size = 0.5, se=FALSE) + xlab('Module-Diagnosis Correlation') + ylab('Probability') + 
         ggtitle(paste0('
Corr = ', round(cor(plot_data$MTcor, plot_data$pvalue),2), ': Corr[Module-ASD corr<0] = ', 
                        round(cor(plot_data$MTcor[plot_data$MTcor<0], plot_data$pvalue[plot_data$MTcor<0]),3),
                        ' Corr[Module-ASD corr>0] = ',
                        round(cor(plot_data$MTcor[plot_data$MTcor>=0], plot_data$pvalue[plot_data$MTcor>=0]),2))) +
         theme_minimal() + theme(legend.position = 'none') + scale_y_sqrt())
plot_data = SFARI_genes_by_module %>% filter (ID != 'SFARI') %>% mutate(color = SFARI_colour_hue(r = 1:3)[ID %>% substr(12,13) %>% as.numeric])

ggplotly(plot_data %>% ggplot(aes(MTcor, pvalue, size=n, color = color)) + geom_point(alpha=0.5, aes(id=Module)) + geom_smooth(size = 0.5, se=FALSE) +
         xlab('Module-Diagnosis Correlation') + scale_colour_manual(values = SFARI_colour_hue(r=1:3)) + ylab('Probability') + 
         ggtitle('Enrichment by SFARI Gene Score') + theme_minimal() + theme(legend.position = 'none'))
plot_data = SFARI_genes_by_module %>% filter(ID == 'SFARI')

ggplotly(plot_data %>% ggplot(aes(MTcor, p.adjust, size=n)) + geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) +
         geom_hline(yintercept = 0.05, color = 'gray', linetype = 'dotted') + xlab('Module-Diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
         theme_minimal() + theme(legend.position = 'none'))

How can there be so many enriched modules?! it makes no sense unless SFARI Genes tend to have more extreme Module Memberships than the other genes, which would get them to the top of the sorted list easier than the rest

plot_data = dataset %>% dplyr::select(gene.score, contains('MM.')) %>% melt %>% 
            mutate(quant = cut(value, breaks = quantile(value, probs = seq(0,1,0.05)) %>% as.vector, labels = FALSE)) %>%
            group_by(gene.score, quant) %>% tally %>% ungroup %>% ungroup
  
plot_data = plot_data %>% group_by(quant) %>% summarise(N = sum(n)) %>% ungroup %>% left_join(plot_data, by = 'quant') %>%
            dplyr::select(quant, gene.score, n, N) %>% mutate(p = round(100*n/N,2)) %>% filter(!is.na(quant)) %>%
            mutate(gene.score = factor(gene.score, levels = rev(c('1','2','3','Neuronal','Others'))))

ggplotly(plot_data %>% filter(!gene.score %in% c('Neuronal','Others')) %>% ggplot(aes(quant, p, fill = gene.score)) + 
         geom_bar(stat='identity') + xlab('Module Membership Quantiles') + ylab('% of SFARI Genes') + #geom_smooth(color = 'gray', se = FALSE) +
         scale_fill_manual(values = SFARI_colour_hue(r=rev(c(1:3)))) + theme_minimal() + theme(legend.position = 'none'))

Is GSEA the right approach to measure functional/pathway enrichment in modules?

  • In general, the higher the Module Membership of a gene, the more likely it is to be assigned to a module, but there are exceptions

  • Ordering defined by Module Membership is a continuous scale, it doesn’t indicate where the module ends

modules = dataset$Module[dataset$Module!='gray'] %>% as.character %>% table %>% names

plot_data = dataset %>% dplyr::select(Module, contains('MM.')) %>% melt %>% mutate(in_module = substring(Module,2) == substring(variable,4)) %>%
            mutate(alpha = ifelse(in_module, 0.8, 0.2))
## Using Module as id variables
p = plot_data %>% filter(Module == modules[1]) %>% ggplot(aes(Module, value, color = in_module)) + 
    geom_jitter(alpha = plot_data$alpha[plot_data$Module == modules[1]]) + theme_minimal() + 
    theme(legend.position = 'none') + xlab('') + ylab('Module Membership') + coord_flip()
ggExtra::ggMarginal(p, type = 'density', groupColour = TRUE, groupFill = TRUE, margins = 'x')